{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ae8e2ae0-da25-44e7-ae82-024173150a15",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests \n",
    "\n",
    "docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'\n",
    "docs_response = requests.get(docs_url)\n",
    "documents_raw = docs_response.json()\n",
    "\n",
    "documents = []\n",
    "\n",
    "for course in documents_raw:\n",
    "    course_name = course['course']\n",
    "\n",
    "    for doc in course['documents']:\n",
    "        doc['course'] = course_name\n",
    "        documents.append(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "57de60e5-b96c-499c-a7cf-0f30fc33b324",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': \"Yes, even if you don't register, you're still eligible to submit the homeworks.\\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.\",\n",
       " 'section': 'General course-related questions',\n",
       " 'question': 'Course - Can I still join the course after the start date?',\n",
       " 'course': 'data-engineering-zoomcamp'}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ce7d0d18-5c07-4010-9f90-bbd021f110c8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\alexe\\miniconda3\\Lib\\site-packages\\sklearn\\utils\\_param_validation.py:11: UserWarning: A NumPy version >=1.23.5 and <2.3.0 is required for this version of SciPy (detected version 2.3.0)\n",
      "  from scipy.sparse import csr_matrix, issparse\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<minsearch.minsearch.Index at 0x24a2177c320>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import minsearch\n",
    "\n",
    "index = minsearch.Index(\n",
    "    text_fields=[\"question\", \"text\", \"section\"],\n",
    "    keyword_fields=[\"course\"]\n",
    ")\n",
    "\n",
    "index.fit(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "aa755a08-b98d-4e92-8994-04e6108499d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "from openai import OpenAI\n",
    "\n",
    "openai_client = OpenAI()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b21237c3-80e9-429c-a089-d45428087046",
   "metadata": {},
   "outputs": [],
   "source": [
    "def search(query):\n",
    "    boost = {'question': 3.0, 'section': 0.5}\n",
    "\n",
    "    results = index.search(\n",
    "        query=query,\n",
    "        filter_dict={'course': 'data-engineering-zoomcamp'},\n",
    "        boost_dict=boost,\n",
    "        num_results=5\n",
    "    )\n",
    "\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "8cc5784e-6515-42e5-be62-8fb915df1088",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_prompt(query, search_results):\n",
    "    prompt_template = \"\"\"\n",
    "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n",
    "Use only the facts from the CONTEXT when answering the QUESTION.\n",
    "\n",
    "QUESTION: {question}\n",
    "\n",
    "CONTEXT: \n",
    "{context}\n",
    "\"\"\".strip()\n",
    "\n",
    "    context = \"\"\n",
    "    \n",
    "    for doc in search_results:\n",
    "        context = context + f\"section: {doc['section']}\\nquestion: {doc['question']}\\nanswer: {doc['text']}\\n\\n\"\n",
    "    \n",
    "    prompt = prompt_template.format(question=query, context=context).strip()\n",
    "    return prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "97d35dec-c25f-472d-b961-20d5c30902ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "def llm(prompt):\n",
    "    response = openai_client.chat.completions.create(\n",
    "        model='gpt-4o-mini',\n",
    "        messages=[{\"role\": \"user\", \"content\": prompt}]\n",
    "    )\n",
    "    \n",
    "    return response.choices[0].message.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "8602f40b-ad3b-49c9-b3cc-051a79c888bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rag(query):\n",
    "    search_results = search(query)\n",
    "    prompt = build_prompt(query, search_results)\n",
    "    answer = llm(prompt)\n",
    "    return answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "5fd4497b-c5d5-4258-b950-6b35d1af4ec5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'To run Kafka, you can execute the following command in your project directory for the Java Kafka producer:\\n\\n```\\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\\n```\\n\\nFor Python, ensure that you create a virtual environment and run the necessary dependencies as indicated. First, create the virtual environment and activate it:\\n\\n```\\npython -m venv env\\nsource env/bin/activate  # On MacOS/Linux\\n# or\\nenv\\\\Scripts\\\\activate  # On Windows\\n```\\n\\nThen, install the required packages from `requirements.txt`. After setting up the environment, you can run your Python scripts. Remember that Docker images should be up and running before executing any Python files.'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rag('how do I run kafka?')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "385b012f-4905-422d-8d7c-3d542dfe5a7c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Yes, you can still enroll in the course even if it has already started. You are also eligible to submit the homework assignments. However, make sure to pay attention to the deadlines for turning in the final projects to avoid leaving everything until the last minute.'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rag('the course has already started, can I still enroll?')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71154c38-cc68-4d29-9809-f0f0545c79c3",
   "metadata": {},
   "source": [
    "## RAG with Vector Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a78f5cb1-709c-4b48-a9b9-1738b75415de",
   "metadata": {},
   "outputs": [],
   "source": [
    "from qdrant_client import QdrantClient, models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "61784d3a-b8f1-45b2-9404-e3495dca1152",
   "metadata": {},
   "outputs": [],
   "source": [
    "qd_client = QdrantClient(\"http://localhost:6333\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "f435b8da-834e-45c1-b83e-e28c294c044d",
   "metadata": {},
   "outputs": [],
   "source": [
    "EMBEDDING_DIMENSIONALITY = 512\n",
    "model_handle = \"jinaai/jina-embeddings-v2-small-en\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b4fa85d4-af1d-46b9-a281-87c5f332bef7",
   "metadata": {},
   "outputs": [],
   "source": [
    "collection_name = \"zoomcamp-faq\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fce5957c-22c2-43e6-8dc1-af0c1acc884e",
   "metadata": {},
   "outputs": [],
   "source": [
    "qd_client.delete_collection(collection_name=collection_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "722e75a1-95ab-4388-94c0-800ec4f58866",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qd_client.create_collection(\n",
    "    collection_name=collection_name,\n",
    "    vectors_config=models.VectorParams(\n",
    "        size=EMBEDDING_DIMENSIONALITY,\n",
    "        distance=models.Distance.COSINE\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "cb29d2a0-64df-4ea2-8920-8e6a16c4bcd4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qd_client.create_payload_index(\n",
    "    collection_name=collection_name,\n",
    "    field_name=\"course\",\n",
    "    field_schema=\"keyword\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "1348ef23-364a-4719-94fe-7a9cf8ea8371",
   "metadata": {},
   "outputs": [],
   "source": [
    "points = []\n",
    "\n",
    "for i, doc in enumerate(documents):\n",
    "    text = doc['question'] + ' ' + doc['text']\n",
    "    vector = models.Document(text=text, model=model_handle)\n",
    "    point = models.PointStruct(\n",
    "        id=i,\n",
    "        vector=vector,\n",
    "        payload=doc\n",
    "    )\n",
    "    points.append(point)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "e5e0e480-2af4-4a0e-b3d0-28a1e2181195",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "qd_client.upsert(\n",
    "    collection_name=collection_name,\n",
    "    points=points\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "2c9be7f7-2813-432b-8d73-3dff448dd02e",
   "metadata": {},
   "outputs": [],
   "source": [
    "question = 'I just discovered the course. Can I still join it?'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "6698670a-6fe7-4f51-83f7-281e80e06f1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def vector_search(question):\n",
    "    print('vector_search is used')\n",
    "    \n",
    "    course = 'data-engineering-zoomcamp'\n",
    "    query_points = qd_client.query_points(\n",
    "        collection_name=collection_name,\n",
    "        query=models.Document(\n",
    "            text=question,\n",
    "            model=model_handle \n",
    "        ),\n",
    "        query_filter=models.Filter( \n",
    "            must=[\n",
    "                models.FieldCondition(\n",
    "                    key=\"course\",\n",
    "                    match=models.MatchValue(value=course)\n",
    "                )\n",
    "            ]\n",
    "        ),\n",
    "        limit=5,\n",
    "        with_payload=True\n",
    "    )\n",
    "    \n",
    "    results = []\n",
    "    \n",
    "    for point in query_points.points:\n",
    "        results.append(point.payload)\n",
    "    \n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "d80d18e3-c512-4f97-9f77-b1145fdb73bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def rag(query):\n",
    "    search_results = vector_search(query)\n",
    "    prompt = build_prompt(query, search_results)\n",
    "    answer = llm(prompt)\n",
    "    return answer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "90bdc6f4-b6f8-4491-84f4-bbd8a6b9f6d3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vector_search is used\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'To run Kafka, you need to follow these steps based on your scripts:\\n\\n1. Make sure your Kafka broker is running. You can confirm this by running `docker ps`. If the broker is not active, navigate to the folder with your docker-compose yaml file and run `docker compose up -d` to start all instances.\\n\\n2. In your project directory, to run the producer, use the following command:\\n   ```\\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\\n   ```\\n\\n3. Ensure that the `StreamsConfig.BOOTSTRAP_SERVERS_CONFIG` in your Java scripts (e.g., JsonProducer.java, JsonConsumer.java) is set to the correct server URL. Also, verify that the cluster key and secrets in `src/main/java/org/example/Secrets.java` are updated with the correct values.\\n\\nBy following these steps, you should be able to run Kafka successfully.'"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rag('how do I run kafka?')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "886901ee-6e55-4295-a106-af25e6483da5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
